# Core data-manipulation and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Suppress library warnings so notebook output stays readable
warnings.filterwarnings("ignore")
# Jupyter magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Set the desired number of columns to display (e.g., display all columns)
pd.set_option('display.max_columns', None)
Type : Type of transaction made
Days for shipping (real) : Actual shipping days of the purchased product
Days for shipment (scheduled) : Days of scheduled delivery of the purchased product
Benefit per order : Earnings per order
Sales per customer : Total sales made per customer
Delivery Status : Delivery status of orders: Advance shipping , Late delivery , Shipping canceled , Shipping on tim...
Late_delivery_risk : Categorical variable that indicates whether the shipment is late (1) or not late (0).
Category Id : Product category code
Category Name : Description of the product category
Customer City : City where the customer made the purchase
Customer Country : Country where the customer made the purchase
Customer Email : Customer's email
Customer Fname : Customer first name
Customer Id : Customer ID
Customer Lname : Customer last name
Customer Password : Masked customer key
Customer Segment : Types of Customers: Consumer , Corporate , Home Office
Customer State : State to which the store where the purchase is registered belongs
Customer Street : Street to which the store where the purchase is registered belongs
Customer Zipcode : Customer Zipcode
Department Id : Department code of store
Department Name : Department name of store
Latitude : Latitude corresponding to location of store
Longitude : Longitude corresponding to location of store
Market : Market to where the order is delivered : Africa , Europe , LATAM , Pacific Asia , USCA
Order City : Destination city of the order
Order Country : Destination country of the order
Order Customer Id : Customer order code
order date (DateOrders) : Date on which the order is made
Order Id : Order code
Order Item Cardprod Id : Product code generated through the RFID reader
Order Item Discount : Order item discount value
Order Item Discount Rate : Order item discount percentage
Order Item Id : Order item code
Order Item Product Price : Price of products without discount
Order Item Profit Ratio : Order Item Profit Ratio
Order Item Quantity : Number of products per order
Sales : Value in sales
Order Item Total : Total amount per order
Order Profit Per Order : Order Profit Per Order
Order Region : Region of the world where the order is delivered : Southeast Asia ,South Asia ,Oceania ,Eastern ...
Order State : State of the region where the order is delivered
Order Status : Order Status : COMPLETE , PENDING , CLOSED , PENDING_PAYMENT ,CANCELED , PROCESSING ,SUSPECTED_FR...
Product Card Id : Product code
Product Category Id : Product category code
Product Description : Product Description
Product Image : Link to the product's image / purchase page
Product Name : Product Name
Product Price : Product Price
Product Status : Stock availability flag: 1 = not available, 0 = available
Shipping date (DateOrders) : Exact date and time of shipment
Shipping Mode : The following shipping modes are presented : Standard Class , First Class , Second Class , Same D...
# Load the dataset, falling back to latin-1 when the file is not valid UTF-8
for encoding in ('utf-8', 'latin-1'):
    try:
        df = pd.read_csv('supply_chain_data.csv', encoding=encoding)
        break
    except UnicodeDecodeError:
        # utf-8 decode failed; retry with the permissive latin-1 codec
        continue
# Preview the first few rows of the dataframe
df.head()
| Type | Days_for_shipping_(real) | Days_for_shipment_(scheduled) | Benefit_per_order | Sales_per_customer | Delivery_Status | Late_delivery_risk | Category_Id | Category_Name | Customer_City | Customer_Country | Customer_Email | Customer_Fname | Customer_Id | Customer_Lname | Customer_Password | Customer_Segment | Customer_State | Customer_Street | Customer_Zipcode | Department_Id | Department_Name | Latitude | Longitude | Market | Order_City | Order_Country | Order_Customer_Id | order_date_(DateOrders) | Order_Id | Order_Item_Cardprod_Id | Order_Item_Discount | Order_Item_Discount_Rate | Order_Item_Id | Order_Item_Product_Price | Order_Item_Profit_Ratio | Order_Item_Quantity | Sales | Order_Item_Total | Order_Profit_Per_Order | Order_Region | Order_State | Order_Status | Order_Zipcode | Product_Card_Id | Product_Category_Id | Product_Description | Product_Image | Product_Name | Product_Price | Product_Status | shipping_date_(DateOrders) | Shipping_Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | 91.250000 | 314.640015 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | Puerto Rico | XXXXXXXXX | Cally | 20755 | Holloway | XXXXXXXXX | Consumer | PR | 5365 Noble Nectar Island | 725.0 | 2 | Fitness | 18.251453 | -66.037056 | Pacific Asia | Bekasi | Indonesia | 20755 | 1/31/2018 | 77202 | 1360 | 13.110000 | 0.04 | 180517 | 327.75 | 0.29 | 1 | 327.75 | 314.640015 | 91.25 | Southeast Asia | Java Occidental | COMPLETE | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 2/3/2018 | Standard Class |
| 1 | TRANSFER | 5 | 4 | -249.089996 | 311.359985 | Late delivery | 1 | 73 | Sporting Goods | Caguas | Puerto Rico | XXXXXXXXX | Irene | 19492 | Luna | XXXXXXXXX | Consumer | PR | 2679 Rustic Loop | 725.0 | 2 | Fitness | 18.279451 | -66.037064 | Pacific Asia | Bikaner | India | 19492 | 1/13/2018 | 75939 | 1360 | 16.389999 | 0.05 | 179254 | 327.75 | -0.80 | 1 | 327.75 | 311.359985 | -249.09 | South Asia | Rajastan | PENDING | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/18/2018 | Standard Class |
| 2 | CASH | 4 | 4 | -247.779999 | 309.720001 | Shipping on time | 0 | 73 | Sporting Goods | San Jose | EE. UU. | XXXXXXXXX | Gillian | 19491 | Maldonado | XXXXXXXXX | Consumer | CA | 8510 Round Bear Gate | 95125.0 | 2 | Fitness | 37.292233 | -121.881279 | Pacific Asia | Bikaner | India | 19491 | 1/13/2018 | 75938 | 1360 | 18.030001 | 0.06 | 179253 | 327.75 | -0.80 | 1 | 327.75 | 309.720001 | -247.78 | South Asia | Rajastan | CLOSED | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/17/2018 | Standard Class |
| 3 | DEBIT | 3 | 4 | 22.860001 | 304.809998 | Advance shipping | 0 | 73 | Sporting Goods | Los Angeles | EE. UU. | XXXXXXXXX | Tana | 19490 | Tate | XXXXXXXXX | Home Office | CA | 3200 Amber Bend | 90027.0 | 2 | Fitness | 34.125946 | -118.291016 | Pacific Asia | Townsville | Australia | 19490 | 1/13/2018 | 75937 | 1360 | 22.940001 | 0.07 | 179252 | 327.75 | 0.08 | 1 | 327.75 | 304.809998 | 22.86 | Oceania | Queensland | COMPLETE | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/16/2018 | Standard Class |
| 4 | PAYMENT | 2 | 4 | 134.210007 | 298.250000 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | Puerto Rico | XXXXXXXXX | Orli | 19489 | Hendricks | XXXXXXXXX | Corporate | PR | 8671 Iron Anchor Corners | 725.0 | 2 | Fitness | 18.253769 | -66.037048 | Pacific Asia | Townsville | Australia | 19489 | 1/13/2018 | 75936 | 1360 | 29.500000 | 0.09 | 179251 | 327.75 | 0.45 | 1 | 327.75 | 298.250000 | 134.21 | Oceania | Queensland | PENDING_PAYMENT | NaN | 1360 | 73 | NaN | http://images.acmesports.sports/Smart+watch | Smart watch | 327.75 | 0 | 1/15/2018 | Standard Class |
# Coordinates recorded for orders shipped to Sydney
df.loc[df['Order_City'] == 'Sydney', ['Latitude', 'Longitude']]
| Latitude | Longitude | |
|---|---|---|
| 368 | 33.773952 | -117.961899 |
| 436 | 40.687035 | -73.624466 |
| 437 | 18.263319 | -66.037056 |
| 469 | 18.218719 | -66.370544 |
| 711 | 18.242109 | -66.370522 |
| ... | ... | ... |
| 180250 | 39.954800 | -76.757874 |
| 180286 | 34.245670 | -118.573418 |
| 180364 | 39.318817 | -84.495956 |
| 180437 | 18.033340 | -66.851921 |
| 180448 | 34.081654 | -118.178352 |
829 rows × 2 columns
# Rename columns for clarity and consistent naming
column_renames = {
    'Sales': 'Order_Total_Before_Discount',
    'Order_Item_Total': 'Order_Total_After_Discount',
    'shipping_date_(DateOrders)': 'Shipping_Date',
    'order_date_(DateOrders)': 'Order_Date',
    'Type': 'Transaction_Type',
    'Days_for_shipping_(real)': 'Days_for_Shipping_Real',
    'Days_for_shipment_(scheduled)': 'Days_for_Shipping_Scheduled',
    'Late_delivery_risk': 'Late_Delivery_Risk',
    'Product_Card_Id': 'Product_Id',
    'Order_Item_Discount_Rate': 'Order_Item_Discount_Percentage',
}
df = df.rename(columns=column_renames)
# Summary of columns, non-null counts, and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 180519 entries, 0 to 180518 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Transaction_Type 180519 non-null object 1 Days_for_Shipping_Real 180519 non-null int64 2 Days_for_Shipping_Scheduled 180519 non-null int64 3 Benefit_per_order 180519 non-null float64 4 Sales_per_customer 180519 non-null float64 5 Delivery_Status 180519 non-null object 6 Late_Delivery_Risk 180519 non-null int64 7 Category_Id 180519 non-null int64 8 Category_Name 180519 non-null object 9 Customer_City 180519 non-null object 10 Customer_Country 180519 non-null object 11 Customer_Email 180519 non-null object 12 Customer_Fname 180519 non-null object 13 Customer_Id 180519 non-null int64 14 Customer_Lname 180511 non-null object 15 Customer_Password 180519 non-null object 16 Customer_Segment 180519 non-null object 17 Customer_State 180519 non-null object 18 Customer_Street 180519 non-null object 19 Customer_Zipcode 180516 non-null float64 20 Department_Id 180519 non-null int64 21 Department_Name 180519 non-null object 22 Latitude 180519 non-null float64 23 Longitude 180519 non-null float64 24 Market 180519 non-null object 25 Order_City 180519 non-null object 26 Order_Country 180519 non-null object 27 Order_Customer_Id 180519 non-null int64 28 Order_Date 180519 non-null object 29 Order_Id 180519 non-null int64 30 Order_Item_Cardprod_Id 180519 non-null int64 31 Order_Item_Discount 180519 non-null float64 32 Order_Item_Discount_Percentage 180519 non-null float64 33 Order_Item_Id 180519 non-null int64 34 Order_Item_Product_Price 180519 non-null float64 35 Order_Item_Profit_Ratio 180519 non-null float64 36 Order_Item_Quantity 180519 non-null int64 37 Order_Total_Before_Discount 180519 non-null float64 38 Order_Total_After_Discount 180519 non-null float64 39 Order_Profit_Per_Order 180519 non-null float64 40 Order_Region 180519 non-null object 41 Order_State 180519 non-null object 42 Order_Status 180519 
non-null object 43 Order_Zipcode 24843 non-null float64 44 Product_Id 180519 non-null int64 45 Product_Category_Id 180519 non-null int64 46 Product_Description 0 non-null float64 47 Product_Image 180519 non-null object 48 Product_Name 180519 non-null object 49 Product_Price 180519 non-null float64 50 Product_Status 180519 non-null int64 51 Shipping_Date 180519 non-null object 52 Shipping_Mode 180519 non-null object dtypes: float64(15), int64(14), object(24) memory usage: 73.0+ MB
# Inspect the dtype of every column
df.dtypes
Transaction_Type object Days_for_Shipping_Real int64 Days_for_Shipping_Scheduled int64 Benefit_per_order float64 Sales_per_customer float64 Delivery_Status object Late_Delivery_Risk int64 Category_Id int64 Category_Name object Customer_City object Customer_Country object Customer_Email object Customer_Fname object Customer_Id int64 Customer_Lname object Customer_Password object Customer_Segment object Customer_State object Customer_Street object Customer_Zipcode float64 Department_Id int64 Department_Name object Latitude float64 Longitude float64 Market object Order_City object Order_Country object Order_Customer_Id int64 Order_Date object Order_Id int64 Order_Item_Cardprod_Id int64 Order_Item_Discount float64 Order_Item_Discount_Percentage float64 Order_Item_Id int64 Order_Item_Product_Price float64 Order_Item_Profit_Ratio float64 Order_Item_Quantity int64 Order_Total_Before_Discount float64 Order_Total_After_Discount float64 Order_Profit_Per_Order float64 Order_Region object Order_State object Order_Status object Order_Zipcode float64 Product_Id int64 Product_Category_Id int64 Product_Description float64 Product_Image object Product_Name object Product_Price float64 Product_Status int64 Shipping_Date object Shipping_Mode object dtype: object
# Summary statistics for numeric AND object columns (count/unique/top/freq)
df.describe(include='all')
| Transaction_Type | Days_for_Shipping_Real | Days_for_Shipping_Scheduled | Benefit_per_order | Sales_per_customer | Delivery_Status | Late_Delivery_Risk | Category_Id | Category_Name | Customer_City | Customer_Country | Customer_Email | Customer_Fname | Customer_Id | Customer_Lname | Customer_Password | Customer_Segment | Customer_State | Customer_Street | Customer_Zipcode | Department_Id | Department_Name | Latitude | Longitude | Market | Order_City | Order_Country | Order_Customer_Id | Order_Date | Order_Id | Order_Item_Cardprod_Id | Order_Item_Discount | Order_Item_Discount_Percentage | Order_Item_Id | Order_Item_Product_Price | Order_Item_Profit_Ratio | Order_Item_Quantity | Order_Total_Before_Discount | Order_Total_After_Discount | Order_Profit_Per_Order | Order_Region | Order_State | Order_Status | Order_Zipcode | Product_Id | Product_Category_Id | Product_Description | Product_Image | Product_Name | Product_Price | Product_Status | Shipping_Date | Shipping_Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 180519 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519 | 180519.000000 | 180519.000000 | 180519 | 180519 | 180519 | 180519 | 180519 | 180519.000000 | 180511 | 180519 | 180519 | 180519 | 180519 | 180516.000000 | 180519.000000 | 180519 | 180519.000000 | 180519.000000 | 180519 | 180519 | 180519 | 180519.000000 | 180519 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519.000000 | 180519 | 180519 | 180519 | 24843.000000 | 180519.000000 | 180519.000000 | 0.0 | 180519 | 180519 | 180519.000000 | 180519.0 | 180519 | 180519 |
| unique | 4 | NaN | NaN | NaN | NaN | 4 | NaN | NaN | 50 | 563 | 2 | 1 | 782 | NaN | 1109 | 1 | 3 | 46 | 7458 | NaN | NaN | 11 | NaN | NaN | 5 | 3596 | 164 | NaN | 1127 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 23 | 1089 | 9 | NaN | NaN | NaN | NaN | 118 | 118 | NaN | NaN | 1131 | 4 |
| top | DEBIT | NaN | NaN | NaN | NaN | Late delivery | NaN | NaN | Cleats | Caguas | EE. UU. | XXXXXXXXX | Mary | NaN | Smith | XXXXXXXXX | Consumer | PR | 9126 Wishing Expressway | NaN | NaN | Fan Shop | NaN | NaN | LATAM | Santo Domingo | Estados Unidos | NaN | 7/2/2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Central America | Inglaterra | COMPLETE | NaN | NaN | NaN | NaN | http://images.acmesports.sports/Perfect+Fitnes... | Perfect Fitness Perfect Rip Deck | NaN | NaN | 9/27/2017 | Standard Class |
| freq | 69295 | NaN | NaN | NaN | NaN | 98977 | NaN | NaN | 24551 | 66770 | 111146 | 180519 | 65150 | NaN | 64104 | 180519 | 93504 | 69373 | 122 | NaN | NaN | 66861 | NaN | NaN | 51594 | 2211 | 24840 | NaN | 220 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 28341 | 6722 | 59491 | NaN | NaN | NaN | NaN | 24515 | 24515 | NaN | NaN | 266 | 107752 |
| mean | NaN | 3.497654 | 2.931847 | 21.974989 | 183.107609 | NaN | 0.548291 | 31.851451 | NaN | NaN | NaN | NaN | NaN | 6691.379495 | NaN | NaN | NaN | NaN | NaN | 35921.126914 | 5.443460 | NaN | 29.719955 | -84.915675 | NaN | NaN | NaN | 6691.379495 | NaN | 36221.894903 | 692.509764 | 20.664741 | 0.101668 | 90260.000000 | 141.232550 | 0.120647 | 2.127638 | 203.772096 | 183.107609 | 21.974989 | NaN | NaN | NaN | 55420.648070 | 692.509764 | 31.851451 | NaN | NaN | NaN | 141.232547 | 0.0 | NaN | NaN |
| std | NaN | 1.623722 | 1.374449 | 104.433526 | 120.043670 | NaN | 0.497664 | 15.640064 | NaN | NaN | NaN | NaN | NaN | 4162.918106 | NaN | NaN | NaN | NaN | NaN | 37542.461122 | 1.629246 | NaN | 9.813646 | 21.433241 | NaN | NaN | NaN | 4162.918106 | NaN | 21045.379569 | 336.446807 | 21.800901 | 0.070415 | 52111.490959 | 139.732492 | 0.466796 | 1.453451 | 132.273077 | 120.043670 | 104.433526 | NaN | NaN | NaN | 31921.252921 | 336.446807 | 15.640064 | NaN | NaN | NaN | 139.732489 | 0.0 | NaN | NaN |
| min | NaN | 0.000000 | 0.000000 | -4274.979980 | 7.490000 | NaN | 0.000000 | 2.000000 | NaN | NaN | NaN | NaN | NaN | 1.000000 | NaN | NaN | NaN | NaN | NaN | 603.000000 | 2.000000 | NaN | -33.937553 | -158.025986 | NaN | NaN | NaN | 1.000000 | NaN | 1.000000 | 19.000000 | 0.000000 | 0.000000 | 1.000000 | 9.990000 | -2.750000 | 1.000000 | 9.990000 | 7.490000 | -4274.980000 | NaN | NaN | NaN | 1040.000000 | 19.000000 | 2.000000 | NaN | NaN | NaN | 9.990000 | 0.0 | NaN | NaN |
| 25% | NaN | 2.000000 | 2.000000 | 7.000000 | 104.379997 | NaN | 0.000000 | 18.000000 | NaN | NaN | NaN | NaN | NaN | 3258.500000 | NaN | NaN | NaN | NaN | NaN | 725.000000 | 4.000000 | NaN | 18.265432 | -98.446312 | NaN | NaN | NaN | 3258.500000 | NaN | 18057.000000 | 403.000000 | 5.400000 | 0.040000 | 45130.500000 | 50.000000 | 0.080000 | 1.000000 | 119.980003 | 104.379997 | 7.000000 | NaN | NaN | NaN | 23464.000000 | 403.000000 | 18.000000 | NaN | NaN | NaN | 50.000000 | 0.0 | NaN | NaN |
| 50% | NaN | 3.000000 | 4.000000 | 31.520000 | 163.990005 | NaN | 1.000000 | 29.000000 | NaN | NaN | NaN | NaN | NaN | 6457.000000 | NaN | NaN | NaN | NaN | NaN | 19380.000000 | 5.000000 | NaN | 33.144863 | -76.847908 | NaN | NaN | NaN | 6457.000000 | NaN | 36140.000000 | 627.000000 | 14.000000 | 0.100000 | 90260.000000 | 59.990002 | 0.270000 | 1.000000 | 199.919998 | 163.990005 | 31.520000 | NaN | NaN | NaN | 59405.000000 | 627.000000 | 29.000000 | NaN | NaN | NaN | 59.990000 | 0.0 | NaN | NaN |
| 75% | NaN | 5.000000 | 4.000000 | 64.800003 | 247.399994 | NaN | 1.000000 | 45.000000 | NaN | NaN | NaN | NaN | NaN | 9779.000000 | NaN | NaN | NaN | NaN | NaN | 78207.000000 | 7.000000 | NaN | 39.279617 | -66.370583 | NaN | NaN | NaN | 9779.000000 | NaN | 54144.000000 | 1004.000000 | 29.990000 | 0.160000 | 135389.500000 | 199.990005 | 0.360000 | 3.000000 | 299.950012 | 247.399994 | 64.800000 | NaN | NaN | NaN | 90008.000000 | 1004.000000 | 45.000000 | NaN | NaN | NaN | 199.990000 | 0.0 | NaN | NaN |
| max | NaN | 6.000000 | 4.000000 | 911.799988 | 1939.989990 | NaN | 1.000000 | 76.000000 | NaN | NaN | NaN | NaN | NaN | 20757.000000 | NaN | NaN | NaN | NaN | NaN | 99205.000000 | 12.000000 | NaN | 48.781933 | 115.263077 | NaN | NaN | NaN | 20757.000000 | NaN | 77204.000000 | 1363.000000 | 500.000000 | 0.250000 | 180519.000000 | 1999.989990 | 0.500000 | 5.000000 | 1999.989990 | 1939.989990 | 911.800000 | NaN | NaN | NaN | 99301.000000 | 1363.000000 | 76.000000 | NaN | NaN | NaN | 1999.990000 | 0.0 | NaN | NaN |
# Percentage of missing values per column (isnull().mean() == isnull().sum() / len)
null_values = (df.isnull().mean() * 100).reset_index()
null_values.columns = ['variable', 'null_values_percentage']
# Show only columns that actually contain nulls, worst first
null_values[null_values['null_values_percentage'] > 0].sort_values(
    by='null_values_percentage', ascending=False
)
| variable | null_values_percentage | |
|---|---|---|
| 46 | Product_Description | 100.000000 |
| 43 | Order_Zipcode | 86.238014 |
| 14 | Customer_Lname | 0.004432 |
| 19 | Customer_Zipcode | 0.001662 |
Customer Zipcode: 3 missing values.
Order Zipcode: 155,679 missing values; not critical due to other location features.
Product Description: All values are missing; drop the column.
Customer LName: 8 missing values; not significant since Customer Id is used.
All other columns have no missing values
# Drop redundant and uninformative columns in a single pass.
# (Original issued three separate drop() calls and passed both columns=
# and axis=1 each time — axis is redundant when columns= is given.)
cols_to_drop = [
    # Near-duplicates of other columns
    'Order_Customer_Id', 'Product_Category_Id', 'Order_Item_Cardprod_Id',
    'Sales_per_customer', 'Benefit_per_order',
    # Columns with no analytical value (masked / constant / URL)
    'Customer_Email', 'Customer_Password', 'Product_Image', 'Product_Status',
    # Dropped because of high null_values_percentage
    'Product_Description', 'Order_Zipcode',
]
df.drop(columns=cols_to_drop, inplace=True)
# Changing datatypes: identifier columns are labels, not quantities
cols_to_string = ['Category_Id', 'Customer_Id', 'Department_Id', 'Order_Id', 'Order_Item_Id', 'Product_Id']
df[cols_to_string] = df[cols_to_string].astype('str')
# Parse both date columns into datetime64
cols_to_date = ['Order_Date', 'Shipping_Date']
df[cols_to_date] = df[cols_to_date].apply(pd.to_datetime)
# Dealing with null values: fill remaining nulls with a placeholder string
df.fillna('empty', inplace=True)
df.head()
| Transaction_Type | Days_for_Shipping_Real | Days_for_Shipping_Scheduled | Delivery_Status | Late_Delivery_Risk | Category_Id | Category_Name | Customer_City | Customer_Country | Customer_Fname | Customer_Id | Customer_Lname | Customer_Segment | Customer_State | Customer_Street | Customer_Zipcode | Department_Id | Department_Name | Latitude | Longitude | Market | Order_City | Order_Country | Order_Date | Order_Id | Order_Item_Discount | Order_Item_Discount_Percentage | Order_Item_Id | Order_Item_Product_Price | Order_Item_Profit_Ratio | Order_Item_Quantity | Order_Total_Before_Discount | Order_Total_After_Discount | Order_Profit_Per_Order | Order_Region | Order_State | Order_Status | Product_Id | Product_Name | Product_Price | Shipping_Date | Shipping_Mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | Puerto Rico | Cally | 20755 | Holloway | Consumer | PR | 5365 Noble Nectar Island | 725.0 | 2 | Fitness | 18.251453 | -66.037056 | Pacific Asia | Bekasi | Indonesia | 2018-01-31 | 77202 | 13.110000 | 0.04 | 180517 | 327.75 | 0.29 | 1 | 327.75 | 314.640015 | 91.25 | Southeast Asia | Java Occidental | COMPLETE | 1360 | Smart watch | 327.75 | 2018-02-03 | Standard Class |
| 1 | TRANSFER | 5 | 4 | Late delivery | 1 | 73 | Sporting Goods | Caguas | Puerto Rico | Irene | 19492 | Luna | Consumer | PR | 2679 Rustic Loop | 725.0 | 2 | Fitness | 18.279451 | -66.037064 | Pacific Asia | Bikaner | India | 2018-01-13 | 75939 | 16.389999 | 0.05 | 179254 | 327.75 | -0.80 | 1 | 327.75 | 311.359985 | -249.09 | South Asia | Rajastan | PENDING | 1360 | Smart watch | 327.75 | 2018-01-18 | Standard Class |
| 2 | CASH | 4 | 4 | Shipping on time | 0 | 73 | Sporting Goods | San Jose | EE. UU. | Gillian | 19491 | Maldonado | Consumer | CA | 8510 Round Bear Gate | 95125.0 | 2 | Fitness | 37.292233 | -121.881279 | Pacific Asia | Bikaner | India | 2018-01-13 | 75938 | 18.030001 | 0.06 | 179253 | 327.75 | -0.80 | 1 | 327.75 | 309.720001 | -247.78 | South Asia | Rajastan | CLOSED | 1360 | Smart watch | 327.75 | 2018-01-17 | Standard Class |
| 3 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Sporting Goods | Los Angeles | EE. UU. | Tana | 19490 | Tate | Home Office | CA | 3200 Amber Bend | 90027.0 | 2 | Fitness | 34.125946 | -118.291016 | Pacific Asia | Townsville | Australia | 2018-01-13 | 75937 | 22.940001 | 0.07 | 179252 | 327.75 | 0.08 | 1 | 327.75 | 304.809998 | 22.86 | Oceania | Queensland | COMPLETE | 1360 | Smart watch | 327.75 | 2018-01-16 | Standard Class |
| 4 | PAYMENT | 2 | 4 | Advance shipping | 0 | 73 | Sporting Goods | Caguas | Puerto Rico | Orli | 19489 | Hendricks | Corporate | PR | 8671 Iron Anchor Corners | 725.0 | 2 | Fitness | 18.253769 | -66.037048 | Pacific Asia | Townsville | Australia | 2018-01-13 | 75936 | 29.500000 | 0.09 | 179251 | 327.75 | 0.45 | 1 | 327.75 | 298.250000 | 134.21 | Oceania | Queensland | PENDING_PAYMENT | 1360 | Smart watch | 327.75 | 2018-01-15 | Standard Class |
# Columns treated as categorical in later analysis
cat_cols = [
    'Transaction_Type', 'Days_for_Shipping_Real', 'Days_for_Shipping_Scheduled',
    'Delivery_Status', 'Category_Name', 'Customer_Country', 'Customer_Segment',
    'Department_Name', 'Market', 'Order_Country', 'Order_Item_Quantity',
    'Order_Region', 'Shipping_Mode',
]
# Distribution of units purchased per order item
df['Order_Item_Quantity'].value_counts()
Order_Item_Quantity 1 99134 5 20385 3 20350 4 20335 2 20315 Name: count, dtype: int64
# Boxplot to inspect the spread/outliers of item prices
sns.boxplot(df['Order_Item_Product_Price'])
<AxesSubplot:xlabel='Order_Item_Product_Price'>
# Boxplot to inspect the spread/outliers of discounted order totals
sns.boxplot(df['Order_Total_After_Discount'])
<AxesSubplot:xlabel='Order_Total_After_Discount'>
# Boxplot to inspect the spread/outliers of per-order profit (includes losses)
sns.boxplot(df['Order_Profit_Per_Order'])
<AxesSubplot:xlabel='Order_Profit_Per_Order'>
# Combine first and last name into one field, separated by a space.
# (The original concatenated them with no separator, producing names
# like 'CallyHolloway'.)
df['Customer_Name'] = df['Customer_Fname'].astype(str) + ' ' + df['Customer_Lname'].astype(str)
df.drop(columns=['Customer_Fname', 'Customer_Lname'], inplace=True)
# Decompose the order date into calendar features
df['Order_Day'] = df['Order_Date'].dt.day
df['Order_Weekday'] = df['Order_Date'].dt.weekday
df['Order_Month'] = df['Order_Date'].dt.month
df['Order_Year'] = df['Order_Date'].dt.year
# Order_Date is already datetime64, so the .dt accessor can be used
# directly (the original re-wrapped it in pd.to_datetime redundantly)
df['Day_Name'] = df['Order_Date'].dt.day_name()
df['Month_Name'] = df['Order_Date'].dt.month_name()
# Convert Order_Item_Discount_Percentage from a fraction to a percentage
df['Order_Item_Discount_Percentage'] = df['Order_Item_Discount_Percentage'] * 100
# Days_for_Shipping_Scheduled - Days_for_Shipping_Real
# (positive = arrived early, negative = arrived late)
df['Shipping_Real_Scheduled_Diff'] = df['Days_for_Shipping_Scheduled'] - df['Days_for_Shipping_Real']
# df=df[df['Order_Year']!=2018]
# Count duplicate rows once the unique order-item id is ignored
df.drop('Order_Item_Id', axis=1).duplicated().sum()
0
# Attributes that describe an order (rather than an order item)
order_cols = [
    'Order_Id', 'Transaction_Type', 'Days_for_Shipping_Real', 'Days_for_Shipping_Scheduled',
    'Delivery_Status', 'Customer_Id', 'Customer_Country', 'Customer_State', 'Order_Country',
    'Customer_Segment', 'Order_Region', 'Order_Date', 'Shipping_Date', 'Shipping_Mode',
    'Order_Day', 'Order_Weekday', 'Order_Month', 'Order_Year', 'Day_Name', 'Month_Name',
    'Shipping_Real_Scheduled_Diff', 'Market',
]
# How many rows collapse when the data is reduced to order-level attributes
df[order_cols].duplicated().sum()
114767
# One row per distinct order (order-level view of the item-level data)
order_df = df[order_cols].drop_duplicates()
# Original (Spanish) country names, in order of first appearance
old_country_name = df['Order_Country'].unique()
# English replacements, aligned index-for-index with old_country_name
new_country_name = ['Indonesia', 'India', 'Australia', 'China', 'Japan', 'South Korea', 'Singapore',
                    'Turkey', 'Mongolia', 'United States', 'Nigeria', 'Democratic Republic of the Congo',
                    'Senegal', 'Morocco', 'Germany', 'France', 'Netherlands', 'United Kingdom', 'Guatemala',
                    'El Salvador', 'Panama', 'Dominican Republic', 'Venezuela', 'Colombia', 'Honduras', 'Brazil',
                    'Mexico', 'Uruguay', 'Argentina', 'Cuba', 'Peru', 'Nicaragua', 'Ecuador', 'Angola', 'Sudan',
                    'Somalia', 'Ivory Coast', 'Egypt', 'Italy', 'Spain', 'Sweden', 'Austria', 'Canada', 'Madagascar',
                    'Algeria', 'Liberia', 'Zambia', 'Niger', 'South Africa', 'Mozambique', 'Tanzania', 'Rwanda',
                    'Israel', 'New Zealand', 'Bangladesh', 'Thailand', 'Iraq', 'Saudi Arabia', 'Philippines',
                    'Kazakhstan', 'Iran', 'Myanmar (Burma)', 'Uzbekistan', 'Benin', 'Cameroon', 'Kenya', 'Togo',
                    'Ukraine', 'Poland', 'Portugal', 'Romania', 'Trinidad and Tobago', 'Afghanistan', 'Pakistan',
                    'Vietnam', 'Malaysia', 'Finland', 'Russia', 'Ireland', 'Norway', 'Slovakia', 'Belgium', 'Bolivia',
                    'Chile', 'Jamaica', 'Yemen', 'Ghana', 'Guinea', 'Ethiopia', 'Bulgaria', 'Kyrgyzstan', 'Georgia',
                    'Nepal', 'United Arab Emirates', 'Cambodia', 'Uganda', 'Lesotho', 'Lithuania', 'Switzerland',
                    'Hungary', 'Denmark', 'Haiti', 'Belarus', 'Croatia', 'Laos', 'Bahrain', 'Macedonia', 'Czech Republic',
                    'Sri Lanka', 'Zimbabwe', 'Eritrea', 'Burkina Faso', 'Costa Rica', 'Libya', 'Barbados', 'Tajikistan',
                    'Syria', 'Guadeloupe', 'Papua New Guinea', 'Azerbaijan', 'Turkmenistan', 'Paraguay', 'Jordan',
                    'Hong Kong', 'Martinique', 'Moldova', 'Qatar', 'Mali', 'Albania', 'Republic of the Congo',
                    'Bosnia and Herzegovina', 'Oman', 'Tunisia', 'Sierra Leone', 'Djibouti', 'Burundi', 'Montenegro',
                    'Gabon', 'South Sudan', 'Luxembourg', 'Namibia', 'Mauritania', 'Greece', 'Eswatini', 'Guyana',
                    'French Guiana', 'Central African Republic', 'Taiwan', 'Estonia', 'Lebanon', 'Cyprus',
                    'Guinea-Bissau', 'Suriname', 'Belize', 'Slovenia', 'The Gambia', 'Botswana', 'Armenia',
                    'Equatorial Guinea', 'Kuwait', 'Bhutan', 'Chad', 'Serbia', 'Western Sahara']
# Apply all translations in one vectorized replace.
# (The original looped over 160+ names calling
# df['Order_Country'].replace(old, new, inplace=True) per name, which
# rescans the entire column on each iteration and relies on inplace
# replacement of a column selection — a pattern deprecated in recent pandas.)
country_translation = dict(zip(old_country_name, new_country_name))
df['Order_Country'] = df['Order_Country'].replace(country_translation)
# Cast the engineered numeric/date-part features to strings so that
# plotting libraries treat them as discrete categories, not continuous values
for categorical_col in ['Days_for_Shipping_Scheduled', 'Days_for_Shipping_Real',
                        'Shipping_Real_Scheduled_Diff', 'Order_Day', 'Order_Weekday',
                        'Order_Month', 'Order_Year']:
    df[categorical_col] = df[categorical_col].astype('str')
# Total (pre-discount) sales per product category
category_wise_sales = (
    df.groupby('Category_Name')['Order_Total_Before_Discount']
    .sum()
    .reset_index()
)
fig = px.bar(
    category_wise_sales,
    x='Category_Name',
    y='Order_Total_Before_Discount',
    title='Aggregate Sales by Category',
)
# Label both axes in one layout update
fig.update_layout(yaxis_title='Sales', xaxis_title='Category')
fig.show()
# Daily total sales over the full date range
sales_trend = (
    df.groupby('Order_Date')['Order_Total_Before_Discount']
    .sum()
    .reset_index(name='Sales')
)
# Line plot of the daily totals
fig = px.line(sales_trend, x='Order_Date', y='Sales', title='Sales Trend Over Date')
fig.update_layout(xaxis_title='Date')
fig.show()
# Number of order items per category, smallest first
order_count_by_category = (
    df.groupby(['Category_Name'])['Order_Item_Id']
    .count()
    .reset_index(name='Number of Orders')
    .sort_values(by='Number of Orders', ascending=True)
)
px.bar(order_count_by_category, y='Number of Orders', x='Category_Name', color='Number of Orders')
# Total profit per (country, city) destination, highest first
data = df.groupby(['Order_Country', 'Order_City'])['Order_Profit_Per_Order'].\
sum().reset_index(name='Profit_of_Orders').\
sort_values(by='Profit_of_Orders', ascending=False)
# Attach coordinates by joining back on (country, city).
# NOTE(review): per the data dictionary, Latitude/Longitude describe the
# store/customer location, not the order destination, and one destination
# city maps to many coordinate pairs — the sample output shows Santo
# Domingo rows carrying Puerto Rico and New York coordinates. Confirm this
# join produces the geography you intend before relying on the maps below.
data_with_coords = data.merge(
df[['Order_Country', 'Order_City', 'Latitude', 'Longitude']].drop_duplicates(),
on=['Order_Country', 'Order_City'],
how='left' # Use a left join to keep all rows from the data DataFrame
)
data_with_coords.head()
| Order_Country | Order_City | Profit_of_Orders | Latitude | Longitude | |
|---|---|---|---|---|---|
| 0 | Dominican Republic | Santo Domingo | 51111.67 | 18.291080 | -66.370506 |
| 1 | Dominican Republic | Santo Domingo | 51111.67 | 18.256718 | -66.370628 |
| 2 | Dominican Republic | Santo Domingo | 51111.67 | 18.247389 | -66.370575 |
| 3 | Dominican Republic | Santo Domingo | 51111.67 | 40.640594 | -73.975189 |
| 4 | Dominican Republic | Santo Domingo | 51111.67 | 18.265968 | -66.370636 |
# Keep a single (arbitrary: first-seen) coordinate pair per country/city
data_with_coords_unique = data_with_coords.drop_duplicates(
    subset=['Order_Country', 'Order_City'], keep='first'
)
data_with_coords_unique.head()
| Order_Country | Order_City | Profit_of_Orders | Latitude | Longitude | |
|---|---|---|---|---|---|
| 0 | Dominican Republic | Santo Domingo | 51111.67 | 18.291080 | -66.370506 |
| 694 | United States | New York City | 47889.76 | 18.287645 | -66.370613 |
| 1383 | Honduras | Tegucigalpa | 40973.64 | 18.296848 | -66.370506 |
| 1925 | United States | Los Angeles | 38014.36 | 18.262682 | -66.370499 |
| 2519 | Nicaragua | Managua | 34319.95 | 18.241207 | -66.370506 |
# Roll city-level profit up to the country level
country_profit_sum = (
    data_with_coords_unique.groupby('Order_Country')['Profit_of_Orders']
    .sum()
    .reset_index()
)
# World map shaded by each destination country's total profit
fig = px.choropleth(
    country_profit_sum,
    locations='Order_Country',
    locationmode='country names',
    color='Profit_of_Orders',
    title='Total Profit by Country',
    hover_name='Order_Country',
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig.update_geos(projection_type="natural earth")
fig.update_layout(coloraxis_colorbar=dict(title='Total Profit'))
fig.show()
# City-level scatter map colored by profit
fig = px.scatter_geo(
    data_with_coords_unique,
    lat='Latitude',
    lon='Longitude',
    color='Profit_of_Orders',
    hover_name='Order_City',
    hover_data=['Order_Country', 'Profit_of_Orders'],
    title='Profit Distribution Across Countries and Cities',
)
fig.update_geos(projection_type="natural earth")
fig.update_layout(coloraxis_colorbar=dict(title='Profit'))
fig.show()
# Post-discount revenue aggregated by market and by region
market_revenue = df.groupby('Market')['Order_Total_After_Discount'].sum()
region_revenue = df.groupby('Order_Region')['Order_Total_After_Discount'].sum()

def _revenue_bar(revenue_series, axis_name, chart_title):
    """Dark-themed bar chart of a revenue Series, largest bar first."""
    return px.bar(
        revenue_series.sort_values(ascending=False).reset_index(name='Revenue'),
        x=axis_name,
        y='Revenue',
        title=chart_title,
        template='plotly_dark',
        width=800,
        height=600,
    )

fig1 = _revenue_bar(market_revenue, 'Market', "Total Revenue for all markets")
fig1.show()
fig2 = _revenue_bar(region_revenue, 'Order_Region', "Total sales for all regions")
fig2.show()
# Orders with negative per-order profit, i.e. sold at a loss.
loss = df[df['Order_Profit_Per_Order'] < 0]
# Count loss-making orders by category and by region; chart the top 10 of each
# (category chart first, region chart second, as before).
for src_col, display_col, chart_title, axis_labels in (
    ('Category_Name', 'Category Name', 'Products with Most Loss',
     {'Category Name': 'Product Category', 'Count': 'Count'}),
    ('Order_Region', 'Order Region', 'Regions with Most Loss',
     {'Order Region': 'Region', 'Count': 'Count'}),
):
    top_counts = loss[src_col].value_counts().nlargest(10).reset_index()
    top_counts.columns = [display_col, 'Count']
    loss_fig = px.bar(
        top_counts,
        x=display_col,
        y='Count',
        title=chart_title,
        labels=axis_labels,
        template='plotly_dark',
        width=800,
        height=600,
    )
    loss_fig.show()
# Overall monetary loss across all loss-making orders.
total_loss = loss['Order_Profit_Per_Order'].sum()
print("Total loss: ", total_loss)
Total loss: -3883547.35
Loss may be due to late deliveries and fraud.
# Subset of orders flagged as late deliveries.
late_delivery_data = df[df['Delivery_Status'] == 'Late delivery']
# Top-10 late-delivery counts, first by product, then by category.
for count_col, chart_title, axis_labels in (
    ('Product_Name', 'Top 10 Products with Late Deliveries',
     {'Product_Name': 'Product', 'Late Deliveries': 'Late Delivery Count'}),
    ('Category_Name', 'Top 10 Product Categories with Late Deliveries',
     {'Category_Name': 'Category', 'Late Deliveries': 'Late Delivery Count'}),
):
    late_by_product = late_delivery_data[count_col].value_counts().nlargest(10).reset_index()
    late_by_product.columns = [count_col, 'Late Deliveries']
    late_fig = px.bar(
        late_by_product,
        x=count_col,
        y='Late Deliveries',
        title=chart_title,
        labels=axis_labels,
        template='plotly_dark',
        width=800,
        height=600,
    )
    late_fig.show()
# Late deliveries broken down by destination region and shipping mode.
late_by_region_shipment = (
    late_delivery_data
    .groupby(['Order_Region', 'Shipping_Mode'])
    .size()
    .reset_index(name='Late Deliveries')
    .sort_values(by='Late Deliveries', ascending=False)
)
grouped_fig = px.bar(
    late_by_region_shipment,
    x='Order_Region',
    y='Late Deliveries',
    color='Shipping_Mode',
    barmode='group',
    title='Late Deliveries by Region and Shipment Type',
    labels={'Order_Region': 'Region', 'Late Deliveries': 'Late Delivery Count'},
    template='plotly_dark',
)
grouped_fig.show()
# Suspected-fraud orders that were paid via TRANSFER.
fraud_data = df[(df['Order_Status'] == 'SUSPECTED_FRAUD') & (df['Transaction_Type'] == 'TRANSFER')]
fraud_data.shape
(4062, 48)
# Same row count as above without the TRANSFER filter — every suspected-fraud
# order in this data used the TRANSFER transaction type.
fraud_data = df[(df['Order_Status'] == 'SUSPECTED_FRAUD') ]
fraud_data.shape
(4062, 48)
# Count suspected-fraud TRANSFER orders per destination region and chart them.
fraud_data = df[(df['Order_Status'] == 'SUSPECTED_FRAUD') & (df['Transaction_Type'] == 'TRANSFER')]
fraud_by_region = fraud_data['Order_Region'].value_counts().reset_index()
fraud_by_region.columns = ['Order_Region', 'Count']
# value_counts is already descending; the explicit sort keeps that guarantee.
fraud_by_region = fraud_by_region.sort_values(by='Count', ascending=False)
fraud_region_fig = px.bar(
    fraud_by_region,
    x='Order_Region',
    y='Count',
    title='Regions with Highest Fraud',
    labels={'Order_Region': 'Region', 'Count': 'Fraud Count'},
    template='plotly_dark',
    width=800,
    height=600,
)
fraud_region_fig.show()
# Customers who appear most often among suspected-fraud orders.
suspected_fraud_data = df[df['Order_Status'] == 'SUSPECTED_FRAUD']
fraud_by_customer = suspected_fraud_data['Customer_Name'].value_counts().nlargest(10).reset_index()
fraud_by_customer.columns = ['Customer_Name', 'Fraud Count']
customer_fig = px.bar(
    fraud_by_customer,
    x='Customer_Name',
    y='Fraud Count',
    title='Top 10 Customers Causing Fraud',
    labels={'Customer_Name': 'Customer', 'Fraud Count': 'Fraud Count'},
    template='plotly_dark',
    width=800,
    height=600,
)
customer_fig.show()
# Correlation heatmap over the numeric columns only.
df_numeric = df.select_dtypes(include='number')
plt.figure(figsize=(20, 14))
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, annot_kws={"size": 15})
plt.title('Correlation Matrix')
plt.show()
We can see that 'Order_Item_Product_Price' and 'Product_Price' are identical, so one of them is redundant.
# Drop the exact duplicate of Product_Price spotted in the correlation matrix.
df.drop(columns=['Order_Item_Product_Price'], inplace=True)
Multicollinearity:
Multicollinearity occurs when two or more predictor(Independent) variables in a regression model are highly correlated. This can lead to several problems:
Technique-1 (Linear/Logistic Regression)
Just perform linear regression or logistic regression and note down the weights.
Add some noise in the data like 0.01 or 0.1 to each value in the columns, then perform linear or logistic regression and check the weights.
Feature weights that have changed drastically (compared with the previously noted weights) belong to the features facing multicollinearity. Basically, the weights represent the features — so check the weights and remove the features whose weights changed the most.
Technique-2 (Variance Inflation Factor)
We check the VIF values and after that we have couple of options that we can use for Handling Multicolinearity:
Technique-3 (Lasso Regression)
Lasso (Least Absolute Shrinkage and Selection Operator) regression tackles multicollinearity by shrinking the coefficients of some features towards zero. It achieves this by adding a penalty term to the cost function that penalizes the absolute value of the coefficients. Here's how it helps:
Coefficient Shrinkage: By shrinking coefficients, Lasso can effectively reduce the influence of highly correlated features on the model, making the coefficients more reliable.
Feature Selection: In some cases, Lasso may even shrink coefficients to zero, essentially removing those features from the model. This can be helpful for identifying irrelevant or redundant features.
Technique-4 (Ridge Regression)
Ridge regression, on the other hand, addresses multicollinearity by shrinking the coefficients, but it uses a penalty term based on the squared value of the coefficients (L2 norm). Compared to Lasso:
Choosing Between Lasso and Ridge Regression:
The choice between Lasso and Ridge regression depends on your specific goals:
Feature Selection: If your primary goal is to identify the most important features and potentially reduce model complexity, Lasso might be a better choice.
Model Stability: If your main concern is improving model stability and reducing the impact of multicollinearity, Ridge regression could be sufficient.
Here are some additional points to consider:
Both techniques require tuning a hyperparameter (lambda) that controls the amount of shrinkage. You can evaluate the performance of models with Lasso and Ridge regression using techniques like cross-validation to see which one performs better for your data and goals.
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# AttributeError: module 'numpy' has no attribute 'MachAr'
# -> Got this error while importing variance_inflation_factor.
# -> Need to downgrade the numpy version to 1.23 or 1.22.
# Dependency chain:
# python -> numpy -> scipy -> pandas -> statsmodels
# Compatibility requires that they are released at around the same time,
# or have an age sequence matching the dependency sequence.
# !pip uninstall numpy -y
import numpy as np
import statsmodels
# Check numpy version
print(f"NumPy version: {np.__version__}")
# Check statsmodels version
print(f"statsmodels version: {statsmodels.__version__}")
NumPy version: 1.23.0 statsmodels version: 0.14.1
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Candidate numeric features to screen for multicollinearity.
features = ['Order_Item_Discount',
            'Order_Item_Discount_Percentage', 'Order_Item_Profit_Ratio',
            'Order_Item_Quantity', 'Order_Total_Before_Discount', 'Order_Total_After_Discount',
            'Order_Profit_Per_Order', 'Product_Price']
# One VIF per feature, each computed against all of the others.
feature_matrix = df[features].values
vif = pd.DataFrame({
    'Feature': features,
    'VIF': [variance_inflation_factor(feature_matrix, idx) for idx in range(len(features))],
})
# Format VIF values with 2 decimal places
vif['VIF'] = vif['VIF'].apply(lambda x: f"{x:.2f}")
print(vif.to_string(index=False))
Feature VIF
Order_Item_Discount 389415821.95
Order_Item_Discount_Percentage 6.13
Order_Item_Profit_Ratio 3.41
Order_Item_Quantity 12.81
Order_Total_Before_Discount 25472272275.35
Order_Total_After_Discount 20690128715.48
Order_Profit_Per_Order 3.41
Product_Price 24.87
Several columns have VIF values above 5, and some of them are extremely high. Such high values usually indicate that the value in one column can be derived from the values in other columns. Let's look at the data and see if we can find the relation.
When you do some basic math, you will find the relationships below among these columns:
These are perfect multi colinear features that shouldn't be passed on to regression models because it affects the performance of the model. You can drop one of the features from each of the above relationship.
# Drop one feature from each perfectly collinear relationship found above.
cols_to_drop = ['Product_Price', 'Order_Total_After_Discount', 'Order_Item_Discount']
df.drop(columns=cols_to_drop, inplace=True)
# Re-screen the remaining features.
features = [ 'Order_Item_Discount_Percentage', 'Order_Item_Profit_Ratio',
    'Order_Item_Quantity', 'Order_Total_Before_Discount',
    'Order_Profit_Per_Order']
remaining_matrix = df[features].values
vif = pd.DataFrame({
    'Feature': features,
    'VIF': [variance_inflation_factor(remaining_matrix, idx) for idx in range(len(features))],
})
# Format VIF values with 2 decimal places
vif['VIF'] = vif['VIF'].apply(lambda x: f"{x:.2f}")
print(vif.to_string(index=False))
Feature VIF
Order_Item_Discount_Percentage 2.22
Order_Item_Profit_Ratio 3.37
Order_Item_Quantity 2.44
Order_Total_Before_Discount 2.60
Order_Profit_Per_Order 3.39
# Units sold per product per day, then the 10 highest daily averages.
daily_sales = df.groupby(['Product_Name', 'Order_Date'])['Order_Item_Quantity'].sum().reset_index(name='Quantity_Sold')
avg_daily_sales = (
    daily_sales
    .groupby('Product_Name')['Quantity_Sold']
    .mean()
    .nlargest(10)
    .reset_index(name='Avg_Quantity_Sold')
    .sort_values(by='Avg_Quantity_Sold', ascending=False)
)
demand_fig = px.bar(
    avg_daily_sales,
    x='Product_Name',
    y='Avg_Quantity_Sold',
    title='Most In-Demand Products',
    labels={'Product_Name': 'Product', 'Avg_Quantity_Sold': 'Average Quantity Sold (Daily)'},
    template='plotly_dark',
    width=800,
    height=600,
)
demand_fig.show()
# Take the top-selling product. `.iloc[0]` is positional, so it stays correct
# even if avg_daily_sales is re-sorted and label 0 is no longer first
# (the original label-based `[0]` silently depended on the index).
most_popular_product = avg_daily_sales['Product_Name'].iloc[0]
popular_product_df = df[df['Product_Name'] == most_popular_product]
# Daily units sold of that product, per customer state.
statewise_sales = popular_product_df.groupby(['Customer_State', 'Order_Date'])['Order_Item_Quantity'].sum().reset_index(name='Quantity_Sold')
Minimum order quantity is the fewest number of units a business is willing to sell to a single customer at once. While a retail store may be happy to sell a single t-shirt or one head of lettuce, it isn’t usually profitable to sell a single unit. They may require a minimum order of hundreds or thousands of units, depending on the product.
Many manufacturers sell in bulk to wholesalers, who then sell in bulk to retailers using an MOQ. Then individuals can go to the store to buy just one or two of a product.
Concept: Identify an optimal minimum order quantity for each product to minimize ordering costs and holding costs.
The MOQ calculation focuses on determining the minimum number of units you should order to avoid stockouts while minimizing costs.
moq = avg_daily_sales * lead_time_avg * safety_factor
The MOQ formula typically involves multiplying avg_daily_sales by the lead time (in days) and a safety factor. This calculation results in the number of units you should order to cover your average daily demand during the lead time, plus a buffer for potential fluctuations.
Data Analysis:
Calculate average daily sales for each product by grouping by Product_Name and calculating the mean of Sales.
Analyze the distribution of Days_For_Shipping_Real to understand lead time variability.
Consider factors like storage space and product shelf life.
# Calculate average daily sales (Quantity_Sold) per state for the most popular
# product. NOTE: the original code first assigned a sorted copy of
# avg_daily_sales to this name and immediately overwrote it; that dead
# assignment is removed.
statewise_avg_daily_sales = statewise_sales.groupby('Customer_State')['Quantity_Sold'].mean().reset_index(name='Avg_Quantity_Sold')
# Mean of actual shipping days for this product serves as the lead-time estimate.
lead_time_avg = popular_product_df['Days_for_Shipping_Real'].astype('int').mean()
# MOQ formula:
# moq = avg_daily_sales * lead_time_avg * safety_factor
# The safety factor adds a buffer for demand fluctuations during the lead time.
safety_factor = 1.5
# Minimum order quantity per state.
statewise_avg_daily_sales['MOQ'] = statewise_avg_daily_sales['Avg_Quantity_Sold'] * lead_time_avg * safety_factor
Safety_Stock reference: https://www.netsuite.com/portal/resource/articles/inventory-management/safety-stock.shtml
Safety Stock:
Safety stock is the extra inventory a company keeps on hand to buffer against uncertainties in demand or supply.
Reorder Point:
Reorder point is the inventory level at which a new order should be placed to replenish stock before it runs out.
The reorder point is calculated based on the average daily sales and the safety stock level. It ensures that there is enough stock on hand to cover the lead time (the time it takes for the new order to arrive) and the safety stock level to prevent stockouts during unexpected fluctuations in demand or delays in replenishment.
The time it takes for a new order to arrive, also known as lead time, can vary depending on factors such as supplier location, shipping method, and order processing time. To calculate the lead time, you typically need historical data on past orders to determine the average time it takes from the moment an order is placed until the products are received and available for sale.
# Per-state safety stock and reorder point for the most popular product.
# `.iloc[0]` is positional — robust to any re-indexing of avg_daily_sales.
most_popular_product = avg_daily_sales['Product_Name'].iloc[0]
popular_product_df = df[df['Product_Name'] == most_popular_product].reset_index(drop=True)
popular_product_df['Days_for_Shipping_Real'] = popular_product_df['Days_for_Shipping_Real']\
    .astype('int')
# One grouped aggregation replaces the original three separate groupbys plus
# two merges; the resulting columns and values are identical.
statewise_stats = (
    popular_product_df
    .groupby('Customer_State')
    .agg(
        Daily_Avg_Sales=('Order_Item_Quantity', 'mean'),    # mean units per order
        LeadTime=('Days_for_Shipping_Real', 'mean'),        # mean shipping days
        DailySales_StdDev=('Order_Item_Quantity', 'std'),   # demand variability
    )
    .reset_index()
)
# Desired service level factor (Z) for a given service level (e.g. 95%).
# Typically, Z = 1.96 for a 95% service level.
Z = 1.96
# Safety stock buffers demand variability over the lead time.
statewise_stats['Safety_Stock'] = Z * statewise_stats['DailySales_StdDev'] * \
    np.sqrt(statewise_stats['LeadTime'])
# Reorder when stock falls to expected lead-time demand plus safety stock.
statewise_stats['Reorder_Point'] = statewise_stats['LeadTime'] * \
    statewise_stats['Daily_Avg_Sales'] + statewise_stats['Safety_Stock']
# Display state-wise safety stock
print(statewise_stats[['Customer_State', 'Safety_Stock', 'Reorder_Point']])
Customer_State Safety_Stock Reorder_Point 0 AL 1.847906 6.292350 1 AR 4.953387 14.447787 2 AZ 5.005876 15.072364 3 CA 5.199274 15.750481 4 CO 5.067911 15.006039 5 CT 5.141123 15.310288 6 DC 5.474476 16.437770 7 DE 5.621392 16.017184 8 FL 5.137536 15.620443 9 GA 5.237486 15.942738 10 HI 5.206693 16.375080 11 IA 3.503110 11.169776 12 ID 5.273797 16.767741 13 IL 5.242699 15.907156 14 IN 4.885811 13.918897 15 KS 4.813851 14.706695 16 KY 5.085004 15.893683 17 LA 5.546239 15.923864 18 MA 5.465095 16.254906 19 MD 4.978552 15.092150 20 MI 5.242294 15.620611 21 MN 5.852714 17.741603 22 MO 4.779152 14.513583 23 MT 3.596665 10.637481 24 NC 5.387914 15.896035 25 ND 5.330033 15.392533 26 NJ 5.378815 16.448263 27 NM 5.374584 17.458438 28 NV 5.063698 16.566630 29 NY 5.107072 15.464998 30 OH 5.233929 15.418759 31 OK 5.163685 15.881463 32 OR 5.303812 15.572127 33 PA 5.288551 16.040335 34 PR 5.143070 15.652304 35 RI 5.041681 14.324561 36 SC 5.623906 16.510012 37 TN 5.402437 16.621929 38 TX 5.148630 15.918405 39 UT 5.154018 16.168981 40 VA 5.274808 16.236052 41 WA 5.683994 16.771581 42 WI 5.187228 15.967831 43 WV 4.810861 14.306844
# Columns dropped for the shipping-time model: identifiers, free-text names,
# and fields duplicated by another column (e.g. Category_Id vs Category_Name).
cols_to_drop = ['Market', 'Product_Name', 'Order_Id', 'Customer_Street', 'Customer_State',
    'Customer_City', 'Customer_Country', 'Customer_Id', 'Order_Region', 'Department_Name',
    'Category_Name', 'Day_Name', 'Month_Name', 'Customer_Name',
    'Shipping_Real_Scheduled_Diff']
df_ship = df.drop(columns=cols_to_drop, axis=1)
df_ship.head(5)
| Transaction_Type | Days_for_Shipping_Real | Days_for_Shipping_Scheduled | Delivery_Status | Late_Delivery_Risk | Category_Id | Customer_Segment | Customer_Zipcode | Department_Id | Latitude | Longitude | Order_City | Order_Country | Order_Date | Order_Item_Discount_Percentage | Order_Item_Id | Order_Item_Profit_Ratio | Order_Item_Quantity | Order_Total_Before_Discount | Order_Profit_Per_Order | Order_State | Order_Status | Product_Id | Shipping_Date | Shipping_Mode | Order_Day | Order_Weekday | Order_Month | Order_Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Consumer | 725.0 | 2 | 18.251453 | -66.037056 | Bekasi | Indonesia | 2018-01-31 | 4.0 | 180517 | 0.29 | 1 | 327.75 | 91.25 | Java Occidental | COMPLETE | 1360 | 2018-02-03 | Standard Class | 31 | 2 | 1 | 2018 |
| 1 | TRANSFER | 5 | 4 | Late delivery | 1 | 73 | Consumer | 725.0 | 2 | 18.279451 | -66.037064 | Bikaner | India | 2018-01-13 | 5.0 | 179254 | -0.80 | 1 | 327.75 | -249.09 | Rajastan | PENDING | 1360 | 2018-01-18 | Standard Class | 13 | 5 | 1 | 2018 |
| 2 | CASH | 4 | 4 | Shipping on time | 0 | 73 | Consumer | 95125.0 | 2 | 37.292233 | -121.881279 | Bikaner | India | 2018-01-13 | 6.0 | 179253 | -0.80 | 1 | 327.75 | -247.78 | Rajastan | CLOSED | 1360 | 2018-01-17 | Standard Class | 13 | 5 | 1 | 2018 |
| 3 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Home Office | 90027.0 | 2 | 34.125946 | -118.291016 | Townsville | Australia | 2018-01-13 | 7.0 | 179252 | 0.08 | 1 | 327.75 | 22.86 | Queensland | COMPLETE | 1360 | 2018-01-16 | Standard Class | 13 | 5 | 1 | 2018 |
| 4 | PAYMENT | 2 | 4 | Advance shipping | 0 | 73 | Corporate | 725.0 | 2 | 18.253769 | -66.037048 | Townsville | Australia | 2018-01-13 | 9.0 | 179251 | 0.45 | 1 | 327.75 | 134.21 | Queensland | PENDING_PAYMENT | 1360 | 2018-01-15 | Standard Class | 13 | 5 | 1 | 2018 |
Dropping because
Customer Id and Order Id doesn't contain extra information
Customer Zipcode can be used instead of 'Customer State', 'Customer Street', 'Customer City', 'Customer Country'
Order Region doesn't provide extra information
Department_Id = Department_Name and 'Category_Id' = 'Category_Name'
'Order_Weekday' = 'Day_Name' and 'Order_Month' = 'Month_Name'
# Remove rows whose Customer_Zipcode holds the 'empty' placeholder string so
# the column can be cast to int below.
df_ship = df_ship[df_ship['Customer_Zipcode'] != 'empty']
# Cast day/date-part and id-like columns to integers in one pass instead of
# nine near-identical statements.
int_cols = ['Days_for_Shipping_Scheduled', 'Days_for_Shipping_Real', 'Order_Day',
            'Order_Weekday', 'Order_Month', 'Order_Year', 'Customer_Zipcode',
            'Order_Item_Id', 'Product_Id']
for col in int_cols:
    df_ship[col] = df_ship[col].astype('int')
# Parse Shipping_Date once (the original called pd.to_datetime three times on
# the same column) and split it into numeric parts.
shipping_dates = pd.to_datetime(df_ship['Shipping_Date'])
df_ship['Shipping_Year'] = shipping_dates.dt.year
df_ship['Shipping_Month'] = shipping_dates.dt.month
df_ship['Shipping_Day'] = shipping_dates.dt.day
# The raw date columns are now fully encoded as numeric components.
df_ship.drop(columns=['Order_Date', 'Shipping_Date'], inplace=True)
df_ship.dtypes
Transaction_Type object Days_for_Shipping_Real int32 Days_for_Shipping_Scheduled int32 Delivery_Status object Late_Delivery_Risk int64 Category_Id object Customer_Segment object Customer_Zipcode int32 Department_Id object Latitude float64 Longitude float64 Order_City object Order_Country object Order_Item_Discount_Percentage float64 Order_Item_Id int32 Order_Item_Profit_Ratio float64 Order_Item_Quantity int64 Order_Total_Before_Discount float64 Order_Profit_Per_Order float64 Order_State object Order_Status object Product_Id int32 Shipping_Mode object Order_Day int32 Order_Weekday int32 Order_Month int32 Order_Year int32 Shipping_Year int32 Shipping_Month int32 Shipping_Day int32 dtype: object
# Partition the remaining columns into numeric and categorical feature lists.
numerical_cols = list(df_ship.select_dtypes(include=['float', 'int', 'int64']).columns)
categorical_cols = list(df_ship.select_dtypes(include=['object']).columns)
print("Numerical Features: ", numerical_cols)
print()
print("Categorical Features: ", categorical_cols)
Numerical Features: ['Days_for_Shipping_Real', 'Days_for_Shipping_Scheduled', 'Late_Delivery_Risk', 'Customer_Zipcode', 'Latitude', 'Longitude', 'Order_Item_Discount_Percentage', 'Order_Item_Id', 'Order_Item_Profit_Ratio', 'Order_Item_Quantity', 'Order_Total_Before_Discount', 'Order_Profit_Per_Order', 'Product_Id', 'Order_Day', 'Order_Weekday', 'Order_Month', 'Order_Year', 'Shipping_Year', 'Shipping_Month', 'Shipping_Day'] Categorical Features: ['Transaction_Type', 'Delivery_Status', 'Category_Id', 'Customer_Segment', 'Department_Id', 'Order_City', 'Order_Country', 'Order_State', 'Order_Status', 'Shipping_Mode']
def cross_tab(dataframe, col, columns_list):
    """Cross-tabulate `col` against each column in `columns_list`.

    Returns one DataFrame whose top-level column index is `columns_list`,
    each level holding the corresponding crosstab, with a 'Total' margin
    row and column added to every table.
    """
    tables = [
        pd.crosstab(dataframe[col], dataframe[other], margins=True, margins_name='Total')
        for other in columns_list
    ]
    return pd.concat(tables, axis=1, keys=columns_list)
# Cross-tabulate product categories against delivery, segment, shipping and department dimensions.
cross_tab(df, 'Category_Name', ['Delivery_Status', 'Customer_Segment', 'Shipping_Mode', 'Department_Name'])
| Delivery_Status | Customer_Segment | Shipping_Mode | Department_Name | |||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Advance shipping | Late delivery | Shipping canceled | Shipping on time | Total | Consumer | Corporate | Home Office | Total | First Class | Same Day | Second Class | Standard Class | Total | Apparel | Book Shop | Discs Shop | Fan Shop | Fitness | Footwear | Golf | Health and Beauty | Outdoors | Pet Shop | Technology | Total | |
| Category_Name | ||||||||||||||||||||||||||
| Accessories | 406 | 1014 | 83 | 277 | 1780 | 901 | 537 | 342 | 1780 | 290 | 88 | 360 | 1042 | 1780 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1780 | 0 | 0 | 1780 |
| As Seen on TV! | 15 | 39 | 2 | 12 | 68 | 31 | 22 | 15 | 68 | 10 | 3 | 14 | 41 | 68 | 0 | 0 | 0 | 0 | 0 | 68 | 0 | 0 | 0 | 0 | 0 | 68 |
| Baby | 49 | 109 | 9 | 40 | 207 | 107 | 72 | 28 | 207 | 34 | 27 | 29 | 117 | 207 | 207 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 207 |
| Baseball & Softball | 141 | 349 | 25 | 117 | 632 | 337 | 188 | 107 | 632 | 105 | 30 | 133 | 364 | 632 | 0 | 0 | 0 | 0 | 632 | 0 | 0 | 0 | 0 | 0 | 0 | 632 |
| Basketball | 22 | 37 | 3 | 5 | 67 | 32 | 19 | 16 | 67 | 16 | 0 | 10 | 41 | 67 | 0 | 0 | 0 | 0 | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 67 |
| Books | 87 | 229 | 14 | 75 | 405 | 179 | 129 | 97 | 405 | 53 | 44 | 64 | 244 | 405 | 0 | 405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 405 |
| Boxing & MMA | 84 | 238 | 21 | 80 | 423 | 219 | 125 | 79 | 423 | 67 | 29 | 95 | 232 | 423 | 0 | 0 | 0 | 0 | 0 | 423 | 0 | 0 | 0 | 0 | 0 | 423 |
| CDs | 67 | 141 | 8 | 55 | 271 | 150 | 83 | 38 | 271 | 45 | 18 | 33 | 175 | 271 | 0 | 0 | 271 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 271 |
| Cameras | 126 | 344 | 30 | 92 | 592 | 321 | 158 | 113 | 592 | 152 | 22 | 115 | 303 | 592 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 592 | 592 |
| Camping & Hiking | 3147 | 7487 | 572 | 2523 | 13729 | 7043 | 4213 | 2473 | 13729 | 2057 | 738 | 2668 | 8266 | 13729 | 0 | 0 | 0 | 13729 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13729 |
| Cardio Equipment | 2921 | 6805 | 539 | 2222 | 12487 | 6513 | 3786 | 2188 | 12487 | 1923 | 661 | 2406 | 7497 | 12487 | 0 | 0 | 0 | 0 | 0 | 12487 | 0 | 0 | 0 | 0 | 0 | 12487 |
| Children's Clothing | 157 | 348 | 30 | 117 | 652 | 333 | 211 | 108 | 652 | 78 | 27 | 146 | 401 | 652 | 652 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 652 |
| Cleats | 5570 | 13496 | 1037 | 4448 | 24551 | 12700 | 7347 | 4504 | 24551 | 3745 | 1353 | 4816 | 14637 | 24551 | 24551 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24551 |
| Computers | 117 | 224 | 17 | 84 | 442 | 233 | 137 | 72 | 442 | 55 | 16 | 71 | 300 | 442 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 442 | 442 |
| Consumer Electronics | 101 | 238 | 22 | 70 | 431 | 228 | 135 | 68 | 431 | 62 | 33 | 101 | 235 | 431 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 431 | 431 |
| Crafts | 104 | 271 | 26 | 83 | 484 | 276 | 140 | 68 | 484 | 85 | 14 | 100 | 285 | 484 | 484 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 484 |
| DVDs | 115 | 259 | 24 | 85 | 483 | 261 | 154 | 68 | 483 | 62 | 16 | 89 | 316 | 483 | 0 | 0 | 483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 483 |
| Electronics | 733 | 1770 | 132 | 521 | 3156 | 1686 | 929 | 541 | 3156 | 499 | 176 | 599 | 1882 | 3156 | 0 | 0 | 0 | 0 | 0 | 1127 | 0 | 0 | 2029 | 0 | 0 | 3156 |
| Fishing | 4034 | 9516 | 730 | 3045 | 17325 | 9023 | 5237 | 3065 | 17325 | 2715 | 868 | 3314 | 10428 | 17325 | 0 | 0 | 0 | 17325 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17325 |
| Fitness Accessories | 69 | 176 | 13 | 51 | 309 | 151 | 90 | 68 | 309 | 50 | 13 | 60 | 186 | 309 | 0 | 0 | 0 | 0 | 0 | 309 | 0 | 0 | 0 | 0 | 0 | 309 |
| Garden | 116 | 270 | 18 | 80 | 484 | 261 | 160 | 63 | 484 | 73 | 29 | 87 | 295 | 484 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 484 | 0 | 0 | 484 |
| Girls' Apparel | 284 | 664 | 64 | 189 | 1201 | 604 | 384 | 213 | 1201 | 200 | 59 | 229 | 713 | 1201 | 0 | 0 | 0 | 0 | 0 | 0 | 1201 | 0 | 0 | 0 | 0 | 1201 |
| Golf Apparel | 116 | 229 | 12 | 84 | 441 | 233 | 133 | 75 | 441 | 58 | 22 | 77 | 284 | 441 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 441 | 0 | 0 | 441 |
| Golf Bags & Carts | 12 | 42 | 0 | 7 | 61 | 32 | 16 | 13 | 61 | 19 | 1 | 13 | 28 | 61 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 61 | 0 | 0 | 61 |
| Golf Balls | 357 | 805 | 72 | 241 | 1475 | 786 | 417 | 272 | 1475 | 209 | 88 | 277 | 901 | 1475 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1475 | 0 | 0 | 1475 |
| Golf Gloves | 240 | 602 | 41 | 187 | 1070 | 540 | 338 | 192 | 1070 | 160 | 59 | 211 | 640 | 1070 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1070 | 0 | 0 | 1070 |
| Golf Shoes | 120 | 291 | 22 | 91 | 524 | 275 | 160 | 89 | 524 | 78 | 33 | 108 | 305 | 524 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 524 | 0 | 0 | 524 |
| Health and Beauty | 83 | 202 | 16 | 61 | 362 | 190 | 117 | 55 | 362 | 67 | 14 | 66 | 215 | 362 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 362 | 0 | 0 | 0 | 362 |
| Hockey | 158 | 329 | 26 | 101 | 614 | 327 | 185 | 102 | 614 | 98 | 41 | 93 | 382 | 614 | 0 | 0 | 0 | 0 | 614 | 0 | 0 | 0 | 0 | 0 | 0 | 614 |
| Hunting & Shooting | 101 | 247 | 16 | 76 | 440 | 234 | 129 | 77 | 440 | 57 | 38 | 88 | 257 | 440 | 0 | 0 | 0 | 440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 440 |
| Indoor/Outdoor Games | 4451 | 10565 | 828 | 3454 | 19298 | 9959 | 5974 | 3365 | 19298 | 2927 | 1087 | 3750 | 11534 | 19298 | 0 | 0 | 0 | 19298 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 19298 |
| Kids' Golf Clubs | 84 | 205 | 28 | 67 | 384 | 198 | 114 | 72 | 384 | 53 | 21 | 68 | 242 | 384 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 384 | 0 | 0 | 384 |
| Lacrosse | 75 | 206 | 14 | 48 | 343 | 182 | 98 | 63 | 343 | 51 | 16 | 70 | 206 | 343 | 0 | 0 | 0 | 0 | 343 | 0 | 0 | 0 | 0 | 0 | 0 | 343 |
| Men's Clothing | 51 | 113 | 10 | 34 | 208 | 111 | 66 | 31 | 208 | 37 | 13 | 32 | 126 | 208 | 208 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 208 |
| Men's Footwear | 5104 | 12121 | 983 | 4038 | 22246 | 11472 | 6817 | 3957 | 22246 | 3366 | 1182 | 4410 | 13288 | 22246 | 22246 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22246 |
| Men's Golf Clubs | 83 | 135 | 9 | 56 | 283 | 138 | 89 | 56 | 283 | 35 | 11 | 50 | 187 | 283 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 283 | 0 | 0 | 283 |
| Music | 97 | 248 | 17 | 72 | 434 | 183 | 127 | 124 | 434 | 73 | 22 | 83 | 256 | 434 | 0 | 0 | 434 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 434 |
| Pet Supplies | 91 | 290 | 20 | 91 | 492 | 241 | 161 | 90 | 492 | 84 | 40 | 119 | 249 | 492 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 492 | 0 | 492 |
| Shop By Sport | 2536 | 6058 | 469 | 1921 | 10984 | 5766 | 3267 | 1951 | 10984 | 1725 | 598 | 2157 | 6504 | 10984 | 0 | 0 | 0 | 0 | 0 | 0 | 10984 | 0 | 0 | 0 | 0 | 10984 |
| Soccer | 28 | 75 | 2 | 33 | 138 | 75 | 37 | 26 | 138 | 15 | 8 | 31 | 84 | 138 | 0 | 0 | 0 | 0 | 138 | 0 | 0 | 0 | 0 | 0 | 0 | 138 |
| Sporting Goods | 83 | 198 | 21 | 55 | 357 | 164 | 136 | 57 | 357 | 59 | 21 | 71 | 206 | 357 | 0 | 0 | 0 | 0 | 357 | 0 | 0 | 0 | 0 | 0 | 0 | 357 |
| Strength Training | 22 | 64 | 2 | 23 | 111 | 58 | 23 | 30 | 111 | 14 | 4 | 22 | 71 | 111 | 0 | 0 | 0 | 0 | 0 | 111 | 0 | 0 | 0 | 0 | 0 | 111 |
| Tennis & Racquet | 65 | 183 | 14 | 66 | 328 | 165 | 112 | 51 | 328 | 53 | 16 | 70 | 189 | 328 | 0 | 0 | 0 | 0 | 328 | 0 | 0 | 0 | 0 | 0 | 0 | 328 |
| Toys | 123 | 291 | 22 | 93 | 529 | 265 | 143 | 121 | 529 | 85 | 24 | 98 | 322 | 529 | 0 | 0 | 0 | 529 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 529 |
| Trade-In | 232 | 542 | 48 | 152 | 974 | 468 | 327 | 179 | 974 | 146 | 42 | 214 | 572 | 974 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 974 | 0 | 0 | 974 |
| Video Games | 190 | 455 | 34 | 159 | 838 | 455 | 207 | 176 | 838 | 96 | 47 | 175 | 520 | 838 | 0 | 0 | 838 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 838 |
| Water Sports | 3547 | 8517 | 660 | 2816 | 15540 | 8111 | 4677 | 2752 | 15540 | 2408 | 860 | 3095 | 9177 | 15540 | 0 | 0 | 0 | 15540 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15540 |
| Women's Apparel | 4886 | 11476 | 919 | 3754 | 21035 | 10864 | 6418 | 3753 | 21035 | 3351 | 1120 | 4038 | 12526 | 21035 | 0 | 0 | 0 | 0 | 0 | 0 | 21035 | 0 | 0 | 0 | 0 | 21035 |
| Women's Clothing | 145 | 367 | 19 | 119 | 650 | 328 | 194 | 128 | 650 | 88 | 39 | 152 | 371 | 650 | 650 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 650 |
| Women's Golf Clubs | 47 | 97 | 11 | 26 | 181 | 95 | 51 | 35 | 181 | 26 | 6 | 39 | 110 | 181 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 181 | 0 | 0 | 181 |
| Total | 41592 | 98977 | 7754 | 32196 | 180519 | 93504 | 54789 | 32226 | 180519 | 27814 | 9737 | 35216 | 107752 | 180519 | 48998 | 405 | 2026 | 66861 | 2479 | 14525 | 33220 | 362 | 9686 | 492 | 1465 | 180519 |
from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-typed column on a copy of df_ship.
categorical_columns = list(df_ship.select_dtypes(include=['object']).columns)
df_ship_encoded = df_ship.copy()
for col in categorical_columns:
    # fit_transform refits per column, so each column's codes are independent.
    df_ship_encoded[col] = LabelEncoder().fit_transform(df_ship[col])
from matplotlib.colors import LinearSegmentedColormap
# Reset background style
sns.set_style('whitegrid')
# Pairwise correlations of the fully numeric, label-encoded frame.
corr = df_ship_encoded.corr().round(2)
# Custom light-to-dark teal colormap for the heatmap.
palette = ['#E5F9F7', '#B4F1EB', '#7DC2BB', '#237A71', '#033530']
my_cmap = LinearSegmentedColormap.from_list('custom_map', palette, N=256)
plt.figure(figsize=(15, 15))
sns.heatmap(corr, cmap=my_cmap, annot=True, center=0, fmt='.2f', linewidths=1, annot_kws={"size": 10})
plt.title('Correlation Matrix', fontsize=18)
plt.show()
# Targets are the two shipping-day columns; features are everything else.
y = df_ship_encoded[['Days_for_Shipping_Real', 'Days_for_Shipping_Scheduled']]
X = df_ship_encoded.drop(columns=['Days_for_Shipping_Real', 'Days_for_Shipping_Scheduled'])
X.shape, y.shape
((180516, 28), (180516, 2))
from sklearn.model_selection import train_test_split
# Hold out 25% of rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xg
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
# Candidate regressors for the multi-output shipping-days targets.
regressors = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1), # Leverage multiple cores if available
    # 'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5, n_jobs=-1), # Adjust n_neighbors as needed
    # 'reg:linear' was deprecated and renamed; 'reg:squarederror' is the same
    # squared-error objective under its current name (avoids the warning).
    'XGBoost': xg.XGBRegressor(objective='reg:squarederror')
}
# Fit each model and collect its test-set regression metrics.
results = {}
for name, model in regressors.items():
    banner = '-' * 25
    print(banner)
    print(banner)
    print(name)
    print(banner)
    print(banner)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        'R-Squared': r2_score(y_test, y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred)
    }
------------------------- ------------------------- Decision Tree ------------------------- ------------------------- ------------------------- ------------------------- Random Forest ------------------------- ------------------------- ------------------------- ------------------------- XGBoost ------------------------- -------------------------
# Report each model's metrics, rounded to 4 decimal places.
for name, metrics in results.items():
    header = f"\n**{name} Model Performance:**"
    lines = [f"{metric}: {round(value, 4)}" for metric, value in metrics.items()]
    print("\n".join([header, *lines]))
def visualize_results(results, metric_name):
    """Bar-chart a single metric (e.g. 'R-Squared') across all models in `results`."""
    names = list(results)
    values = [results[model][metric_name] for model in names]

    plt.figure(figsize=(10, 6))
    plt.bar(names, values)
    plt.xlabel('Model')
    plt.ylabel(metric_name)
    plt.title(f"{metric_name} Comparison")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Example usage: compare R-Squared scores across the fitted models.
visualize_results(results, 'R-Squared')
# The same call works for the other metrics, e.g. 'Mean Squared Error'
# or 'Mean Absolute Error'.
**Decision Tree Model Performance:** R-Squared: 0.9845 Mean Squared Error: 0.0409 Mean Absolute Error: 0.0298 **Random Forest Model Performance:** R-Squared: 0.9922 Mean Squared Error: 0.0206 Mean Absolute Error: 0.0603 **XGBoost Model Performance:** R-Squared: 0.9982 Mean Squared Error: 0.0048 Mean Absolute Error: 0.0291
# Work on a copy so the fraud analysis never mutates the shipping frame.
df_fraud = df_ship.copy(deep=True)
df_fraud.head()
| Transaction_Type | Days_for_Shipping_Real | Days_for_Shipping_Scheduled | Delivery_Status | Late_Delivery_Risk | Category_Id | Customer_Segment | Customer_Zipcode | Department_Id | Latitude | Longitude | Order_City | Order_Country | Order_Item_Discount_Percentage | Order_Item_Id | Order_Item_Profit_Ratio | Order_Item_Quantity | Order_Total_Before_Discount | Order_Profit_Per_Order | Order_State | Order_Status | Product_Id | Shipping_Mode | Order_Day | Order_Weekday | Order_Month | Order_Year | Shipping_Year | Shipping_Month | Shipping_Day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Consumer | 725 | 2 | 18.251453 | -66.037056 | Bekasi | Indonesia | 4.0 | 180517 | 0.29 | 1 | 327.75 | 91.25 | Java Occidental | COMPLETE | 1360 | Standard Class | 31 | 2 | 1 | 2018 | 2018 | 2 | 3 |
| 1 | TRANSFER | 5 | 4 | Late delivery | 1 | 73 | Consumer | 725 | 2 | 18.279451 | -66.037064 | Bikaner | India | 5.0 | 179254 | -0.80 | 1 | 327.75 | -249.09 | Rajastan | PENDING | 1360 | Standard Class | 13 | 5 | 1 | 2018 | 2018 | 1 | 18 |
| 2 | CASH | 4 | 4 | Shipping on time | 0 | 73 | Consumer | 95125 | 2 | 37.292233 | -121.881279 | Bikaner | India | 6.0 | 179253 | -0.80 | 1 | 327.75 | -247.78 | Rajastan | CLOSED | 1360 | Standard Class | 13 | 5 | 1 | 2018 | 2018 | 1 | 17 |
| 3 | DEBIT | 3 | 4 | Advance shipping | 0 | 73 | Home Office | 90027 | 2 | 34.125946 | -118.291016 | Townsville | Australia | 7.0 | 179252 | 0.08 | 1 | 327.75 | 22.86 | Queensland | COMPLETE | 1360 | Standard Class | 13 | 5 | 1 | 2018 | 2018 | 1 | 16 |
| 4 | PAYMENT | 2 | 4 | Advance shipping | 0 | 73 | Corporate | 725 | 2 | 18.253769 | -66.037048 | Townsville | Australia | 9.0 | 179251 | 0.45 | 1 | 327.75 | 134.21 | Queensland | PENDING_PAYMENT | 1360 | Standard Class | 13 | 5 | 1 | 2018 | 2018 | 1 | 15 |
# Binary target: 1 when the order status flags suspected fraud, else 0.
df_fraud['SUSPECTED_FRAUD'] = (df_fraud['Order_Status'] == 'SUSPECTED_FRAUD').astype(np.int64)

from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of the fraud frame.
categorical_columns = df_fraud.select_dtypes(include=['object']).columns.tolist()
df_fraud_encoded = df_fraud.copy()
encoder = LabelEncoder()
for col in categorical_columns:
    df_fraud_encoded[col] = encoder.fit_transform(df_fraud_encoded[col])

# Drop the raw status column (it would leak the label) along with the target.
X = df_fraud_encoded.drop(['SUSPECTED_FRAUD', 'Order_Status'], axis=1)
y = df_fraud_encoded['SUSPECTED_FRAUD']
X.shape, y.shape
((180516, 29), (180516,))
from sklearn.model_selection import train_test_split

# 75/25 train/test split with a fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import *
from sklearn import metrics
from sklearn.metrics import classification_report

# Each candidate classifier gets its own scale-then-fit pipeline so the
# StandardScaler is re-fit inside every cross-validation fold (no leakage).
lgr_pipline = Pipeline([("scaler", StandardScaler()),
                        ("LogisticRegression", LogisticRegression())])
rfc_pipline = Pipeline([("scaler", StandardScaler()),
                        ("RandomForestClassifier", RandomForestClassifier(n_jobs=-1))])
gnb_pipline = Pipeline([("scaler", StandardScaler()),
                        ("GaussianNB", GaussianNB())])
sgd_pipline = Pipeline([("scaler", StandardScaler()),
                        ("SGDClassifier", SGDClassifier())])
dt_pipline = Pipeline([("scaler", StandardScaler()),
                       ("DecisionTreeClassifier", DecisionTreeClassifier())])

pip_dict1 = {0: 'Logistic Regression',
             1: 'RandomForestClassifier',
             2: 'GaussianNB',
             3: 'SGDClassifier',
             4: 'DecisionTreeClassifier'}
piplines1 = [lgr_pipline, rfc_pipline, gnb_pipline, sgd_pipline, dt_pipline]

# 10-fold CV accuracy for each pipeline.
# FIX: scores_df was previously created empty and never populated; it now
# actually records each model's mean cross-validation score.
rows = []
for i, pipe in enumerate(piplines1):
    score = cross_val_score(pipe, X, y, cv=10)
    print(pip_dict1[i], ": ", score.mean())
    rows.append({"Model": pip_dict1[i], "CVScores": score.mean()})
scores_df = pd.DataFrame(rows, columns=["Model", "CVScores"])
Logistic Regression : 0.9764840000872532 RandomForestClassifier : 0.9852366849948485 GaussianNB : 0.9795472823430347 SGDClassifier : 0.9753594599014047 DecisionTreeClassifier : 0.9881283160056528
# Only run this code for the 1st time to find the best parameters
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import accuracy_score, classification_report
# # Define the pipeline
# pipeline = Pipeline([
# ("scaler", StandardScaler()),
# ("classifier", RandomForestClassifier())
# ])
# # Hyperparameter grid for random search
# grid_params = {
# "classifier__n_estimators": [50, 100, 150, 200, 250, 300],
# "classifier__criterion": ["gini", "entropy"],
# "classifier__max_features": ["auto", "sqrt", "log2"],
# }
# # Randomized search with cross-validation
# random_search = RandomizedSearchCV(
# estimator=pipeline,
# param_distributions=grid_params,
# scoring="neg_mean_absolute_error", # Minimize mean absolute error
# n_jobs=-1, # Use all cores for parallelization
# cv=8, # 8-fold cross-validation
# verbose=10, # Print details every 10 iterations
# random_state=42,
# )
# # Perform hyperparameter tuning
# random_search.fit(X_train, y_train)
# # Print best parameters
# print("Best parameters found:", random_search.best_params_)
# # Get the best model
# best_model = random_search.best_estimator_
Best parameters found: {'classifier__n_estimators': 200, 'classifier__max_features': 'auto', 'classifier__criterion': 'entropy'}
# Rebuild the classifier with the best hyperparameters found by the search.
# FIX: max_features='auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3; for classifiers it meant sqrt(n_features), so 'sqrt' is the exact
# equivalent and keeps the code working on current scikit-learn.
best_model = RandomForestClassifier(n_estimators=200,
                                    max_features='sqrt',
                                    criterion='entropy',
                                    random_state=42,  # reproducible, matching the other cells
                                    n_jobs=-1)        # train trees in parallel

# Train the best model on the entire training split.
best_model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = best_model.predict(X_test)

# Overall accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1 — important here because the fraud
# class is heavily under-represented.
print("Classification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.9955017837753994
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 44070
1 0.93 0.88 0.90 1059
accuracy 1.00 45129
macro avg 0.96 0.94 0.95 45129
weighted avg 1.00 1.00 1.00 45129